Getting data from the web: API access

MACS 30500 University of Chicago

November 13, 2017

Methods for obtaining data online

  • Click and download
  • Install and play
  • API query
  • Scraping

Click and download

  • read.csv or readr::read_csv
  • downloader package or curl

Data supplied on the web

  • Application programming interface (API)
  • Client
  • Server

Install and play packages

  • Packages with R functions written for existing APIs
  • Useful because
    • provenance
    • reproducible
    • updating
    • ease
    • scaling

The Manifesto Project: manifestoR

  • Collects and organizes political party manifestos from around the world
  • Over 1000 parties from 1945 until today in over 50 countries on five continents
  • manifestoR

Load library and set API key

# load the Manifesto Project API client
library(manifestoR)

# retrieve API key stored in .Rprofile (e.g. options(manifesto_key = "..."))
# so the key never appears in the script itself
mp_setapikey(key = getOption("manifesto_key"))

Retrieve the database

# download the main dataset; the outer parentheses print the result
(mpds <- mp_maindataset())
## Connecting to Manifesto Project DB API... 
## Connecting to Manifesto Project DB API... corpus version: 2017-1
## # A tibble: 4,214 x 173
##    country countryname oecdmember eumember      edate   date party
##      <dbl>       <chr>      <dbl>    <dbl>     <date>  <dbl> <dbl>
##  1      11      Sweden          0        0 1944-09-17 194409 11220
##  2      11      Sweden          0        0 1944-09-17 194409 11320
##  3      11      Sweden          0        0 1944-09-17 194409 11420
##  4      11      Sweden          0        0 1944-09-17 194409 11620
##  5      11      Sweden          0        0 1944-09-17 194409 11810
##  6      11      Sweden          0        0 1948-09-19 194809 11220
##  7      11      Sweden          0        0 1948-09-19 194809 11320
##  8      11      Sweden          0        0 1948-09-19 194809 11420
##  9      11      Sweden          0        0 1948-09-19 194809 11620
## 10      11      Sweden          0        0 1948-09-19 194809 11810
## # ... with 4,204 more rows, and 166 more variables: partyname <chr>,
## #   partyabbrev <chr>, parfam <dbl>, coderid <dbl>, manual <dbl>,
## #   coderyear <dbl>, testresult <dbl>, testeditsim <dbl>, pervote <dbl>,
## #   voteest <dbl>, presvote <dbl>, absseat <dbl>, totseats <dbl>,
## #   progtype <dbl>, datasetorigin <dbl>, corpusversion <chr>, total <dbl>,
## #   peruncod <dbl>, per101 <dbl>, per102 <dbl>, per103 <dbl>,
## #   per104 <dbl>, per105 <dbl>, per106 <dbl>, per107 <dbl>, per108 <dbl>,
## #   per109 <dbl>, per110 <dbl>, per201 <dbl>, per202 <dbl>, per203 <dbl>,
## #   per204 <dbl>, per301 <dbl>, per302 <dbl>, per303 <dbl>, per304 <dbl>,
## #   per305 <dbl>, per401 <dbl>, per402 <dbl>, per403 <dbl>, per404 <dbl>,
## #   per405 <dbl>, per406 <dbl>, per407 <dbl>, per408 <dbl>, per409 <dbl>,
## #   per410 <dbl>, per411 <dbl>, per412 <dbl>, per413 <dbl>, per414 <dbl>,
## #   per415 <dbl>, per416 <dbl>, per501 <dbl>, per502 <dbl>, per503 <dbl>,
## #   per504 <dbl>, per505 <dbl>, per506 <dbl>, per507 <dbl>, per601 <dbl>,
## #   per602 <dbl>, per603 <dbl>, per604 <dbl>, per605 <dbl>, per606 <dbl>,
## #   per607 <dbl>, per608 <dbl>, per701 <dbl>, per702 <dbl>, per703 <dbl>,
## #   per704 <dbl>, per705 <dbl>, per706 <dbl>, per1011 <dbl>,
## #   per1012 <dbl>, per1013 <dbl>, per1014 <dbl>, per1015 <dbl>,
## #   per1016 <dbl>, per1021 <dbl>, per1022 <dbl>, per1023 <dbl>,
## #   per1024 <dbl>, per1025 <dbl>, per1026 <dbl>, per1031 <dbl>,
## #   per1032 <dbl>, per1033 <dbl>, per2021 <dbl>, per2022 <dbl>,
## #   per2023 <dbl>, per2031 <dbl>, per2032 <dbl>, per2033 <dbl>,
## #   per2041 <dbl>, per3011 <dbl>, per3051 <dbl>, per3052 <dbl>,
## #   per3053 <dbl>, ...

How many manifestos have been published by each political party in Sweden?

# Count manifestos per Swedish party and draw a horizontal bar chart,
# with parties ordered by their number of manifestos.
mpds %>%
  filter(countryname == "Sweden") %>%
  count(partyname) %>%
  ggplot(aes(x = fct_reorder(partyname, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Political manifestos published in Sweden",
    x = NULL,
    y = "Total (1948-present)"
  )

How have the Democratic and Republican Party manifestos in the United States changed over time?

# Democrats are party 61320, Republicans 61620
mpds %>%
  filter(party %in% c(61320, 61620)) %>%
  # mp_scale() assigns each manifesto an ideological score
  mutate(ideo = mp_scale(.)) %>%
  select(partyname, edate, ideo) %>%
  ggplot(aes(x = edate, y = ideo, color = partyname)) +
  geom_line() +
  scale_color_manual(values = c("blue", "red")) +
  theme(legend.position = "bottom") +
  labs(
    title = "Ideological scaling of major US political parties",
    x = "Year",
    y = "Ideological position",
    color = NULL
  )

Analyze text of manifestos

# download full text of all US manifestos published after 2012-01-01
# (returns a ManifestoCorpus with one document per party -- see output)
(docs <- mp_corpus(countryname == "United States" & edate > as.Date("2012-01-01")))
## Connecting to Manifesto Project DB API... 
## Connecting to Manifesto Project DB API... corpus version: 2017-1 
## Connecting to Manifesto Project DB API... 
## Connecting to Manifesto Project DB API... corpus version: 2017-1 
## Connecting to Manifesto Project DB API... corpus version: 2017-1 
## Connecting to Manifesto Project DB API... corpus version: 2017-1
## <<ManifestoCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 2
# generate wordcloud of most common terms
docs %>%
  # convert the corpus into a data frame, one row per document
  tidy() %>%
  # replace numeric party codes with readable labels
  mutate(party = factor(party, levels = c(61320, 61620),
                        labels = c("Democratic Party", "Republican Party"))) %>%
  # split document text into one row per word token
  unnest_tokens(word, text) %>%
  # drop common stop words ("the", "and", ...)
  anti_join(stop_words) %>%
  count(party, word, sort = TRUE) %>%
  na.omit() %>%
  # word-by-party matrix of counts, the input comparison.cloud() expects
  reshape2::acast(word ~ party, value.var = "n", fill = 0) %>%
  comparison.cloud(max.words = 200)

Twitter API

Using twitteR

# Twitter REST API client
library(twitteR)

OAuth authentication

  1. Create a Twitter account
  2. Store your API key and token using the .Rprofile method
  3. setup_twitter_oauth() from the console
  4. Get back into RStudio

Searching tweets

# authenticate with the key/token stored in .Rprofile; this opens a
# browser window to complete the OAuth handshake
setup_twitter_oauth(consumer_key = getOption("twitter_api_key"),
                    consumer_secret = getOption("twitter_api_token"))
## [1] "Using browser based authentication"
# retrieve the 5 most recent tweets matching the #rstats hashtag
tweets <- searchTwitter('#rstats', n = 5)
tweets
## [[1]]
## [1] "RStudioJoe: #rstats Thank you @jkregenstein for a year's worth of illuminating posts on \"Reproducible Finance with R\" on R View… https://t.co/fJBMn4zKNT"
## 
## [[2]]
## [1] "yuhangx: RT @dalejbarr: Sign up for this @PSstatistics course on #rstats for psych w/me &amp; @luc_bussiere on the banks of Loch Lomond! Apr2018 https:/…"
## 
## [[3]]
## [1] "MooresMt: RT @RLangTip: Get started with foreach and parallel programming: https://t.co/192iEfHCKc #rstats"
## 
## [[4]]
## [1] "BuzzNicholson: RT @sesync: Want to learn about geospatial data analysis? Apply for short course by Jan 5. #rstats https://t.co/qQssORUXwt https://t.co/VUw…"
## 
## [[5]]
## [1] "srharacha: RT @ucfagls: Smoothing discrete spatial data with a Markov random field smoother in #mgcv w #rstats\nhttps://t.co/Tm5Im3tt2z https://t.co/iA…"

Searching users

# look up a user object by screen name
clinton <- getUser("hillaryclinton")
clinton$getDescription()
## [1] "Wife, mom, grandma, women+kids advocate, FLOTUS, Senator, SecState, hair icon, pantsuit aficionado, 2016 presidential candidate."
# first 5 accounts this user follows
clinton$getFriends(n = 5)
## $`18622869`
## [1] "ezraklein"
## 
## $`2590811666`
## [1] "Color"
## 
## $`913945418005958656`
## [1] "TATLGDoc"
## 
## $`587536673`
## [1] "GiffordsCourage"
## 
## $`14868699`
## [1] "ScaryMommy"

Tidying tweets

# each tweet is a 'status' reference-class object, not a plain list
str(tweets)
## List of 5
##  $ :Reference class 'status' [package "twitteR"] with 17 fields
##   ..$ text         : chr "#rstats Thank you @jkregenstein for a year's worth of illuminating posts on \"Reproducible Finance with R\" on "| __truncated__
##   ..$ favorited    : logi FALSE
##   ..$ favoriteCount: num 0
##   ..$ replyToSN    : chr(0) 
##   ..$ created      : POSIXct[1:1], format: "2017-11-10 19:19:17"
##   ..$ truncated    : logi TRUE
##   ..$ replyToSID   : chr(0) 
##   ..$ id           : chr "929065944768438272"
##   ..$ replyToUID   : chr(0) 
##   ..$ statusSource : chr "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>"
##   ..$ screenName   : chr "RStudioJoe"
##   ..$ retweetCount : num 0
##   ..$ isRetweet    : logi FALSE
##   ..$ retweeted    : logi FALSE
##   ..$ longitude    : chr(0) 
##   ..$ latitude     : chr(0) 
##   ..$ urls         :'data.frame':    1 obs. of  5 variables:
##   .. ..$ url         : chr "https://t.co/fJBMn4zKNT"
##   .. ..$ expanded_url: chr "https://twitter.com/i/web/status/929065944768438272"
##   .. ..$ display_url : chr "twitter.com/i/web/status/9…"
##   .. ..$ start_index : num 117
##   .. ..$ stop_index  : num 140
##   ..and 53 methods, of which 39 are  possibly relevant:
##   ..  getCreated, getFavoriteCount, getFavorited, getId, getIsRetweet,
##   ..  getLatitude, getLongitude, getReplyToSID, getReplyToSN,
##   ..  getReplyToUID, getRetweetCount, getRetweeted, getRetweeters,
##   ..  getRetweets, getScreenName, getStatusSource, getText, getTruncated,
##   ..  getUrls, initialize, setCreated, setFavoriteCount, setFavorited,
##   ..  setId, setIsRetweet, setLatitude, setLongitude, setReplyToSID,
##   ..  setReplyToSN, setReplyToUID, setRetweetCount, setRetweeted,
##   ..  setScreenName, setStatusSource, setText, setTruncated, setUrls,
##   ..  toDataFrame, toDataFrame#twitterObj
##  $ :Reference class 'status' [package "twitteR"] with 17 fields
##   ..$ text         : chr "RT @dalejbarr: Sign up for this @PSstatistics course on #rstats for psych w/me &amp; @luc_bussiere on the banks"| __truncated__
##   ..$ favorited    : logi FALSE
##   ..$ favoriteCount: num 0
##   ..$ replyToSN    : chr(0) 
##   ..$ created      : POSIXct[1:1], format: "2017-11-10 19:18:43"
##   ..$ truncated    : logi FALSE
##   ..$ replyToSID   : chr(0) 
##   ..$ id           : chr "929065805257551875"
##   ..$ replyToUID   : chr(0) 
##   ..$ statusSource : chr "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>"
##   ..$ screenName   : chr "yuhangx"
##   ..$ retweetCount : num 9
##   ..$ isRetweet    : logi TRUE
##   ..$ retweeted    : logi FALSE
##   ..$ longitude    : chr(0) 
##   ..$ latitude     : chr(0) 
##   ..$ urls         :'data.frame':    0 obs. of  4 variables:
##   .. ..$ url         : chr(0) 
##   .. ..$ expanded_url: chr(0) 
##   .. ..$ dispaly_url : chr(0) 
##   .. ..$ indices     : num(0) 
##   ..and 53 methods, of which 39 are  possibly relevant:
##   ..  getCreated, getFavoriteCount, getFavorited, getId, getIsRetweet,
##   ..  getLatitude, getLongitude, getReplyToSID, getReplyToSN,
##   ..  getReplyToUID, getRetweetCount, getRetweeted, getRetweeters,
##   ..  getRetweets, getScreenName, getStatusSource, getText, getTruncated,
##   ..  getUrls, initialize, setCreated, setFavoriteCount, setFavorited,
##   ..  setId, setIsRetweet, setLatitude, setLongitude, setReplyToSID,
##   ..  setReplyToSN, setReplyToUID, setRetweetCount, setRetweeted,
##   ..  setScreenName, setStatusSource, setText, setTruncated, setUrls,
##   ..  toDataFrame, toDataFrame#twitterObj
##  $ :Reference class 'status' [package "twitteR"] with 17 fields
##   ..$ text         : chr "RT @RLangTip: Get started with foreach and parallel programming: https://t.co/192iEfHCKc #rstats"
##   ..$ favorited    : logi FALSE
##   ..$ favoriteCount: num 0
##   ..$ replyToSN    : chr(0) 
##   ..$ created      : POSIXct[1:1], format: "2017-11-10 19:18:43"
##   ..$ truncated    : logi FALSE
##   ..$ replyToSID   : chr(0) 
##   ..$ id           : chr "929065803672104960"
##   ..$ replyToUID   : chr(0) 
##   ..$ statusSource : chr "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>"
##   ..$ screenName   : chr "MooresMt"
##   ..$ retweetCount : num 8
##   ..$ isRetweet    : logi TRUE
##   ..$ retweeted    : logi FALSE
##   ..$ longitude    : chr(0) 
##   ..$ latitude     : chr(0) 
##   ..$ urls         :'data.frame':    1 obs. of  5 variables:
##   .. ..$ url         : chr "https://t.co/192iEfHCKc"
##   .. ..$ expanded_url: chr "https://cran.r-project.org/web/packages/doParallel/vignettes/gettingstartedParallel.pdf"
##   .. ..$ display_url : chr "cran.r-project.org/web/packages/d…"
##   .. ..$ start_index : num 65
##   .. ..$ stop_index  : num 88
##   ..and 53 methods, of which 39 are  possibly relevant:
##   ..  getCreated, getFavoriteCount, getFavorited, getId, getIsRetweet,
##   ..  getLatitude, getLongitude, getReplyToSID, getReplyToSN,
##   ..  getReplyToUID, getRetweetCount, getRetweeted, getRetweeters,
##   ..  getRetweets, getScreenName, getStatusSource, getText, getTruncated,
##   ..  getUrls, initialize, setCreated, setFavoriteCount, setFavorited,
##   ..  setId, setIsRetweet, setLatitude, setLongitude, setReplyToSID,
##   ..  setReplyToSN, setReplyToUID, setRetweetCount, setRetweeted,
##   ..  setScreenName, setStatusSource, setText, setTruncated, setUrls,
##   ..  toDataFrame, toDataFrame#twitterObj
##  $ :Reference class 'status' [package "twitteR"] with 17 fields
##   ..$ text         : chr "RT @sesync: Want to learn about geospatial data analysis? Apply for short course by Jan 5. #rstats https://t.co"| __truncated__
##   ..$ favorited    : logi FALSE
##   ..$ favoriteCount: num 0
##   ..$ replyToSN    : chr(0) 
##   ..$ created      : POSIXct[1:1], format: "2017-11-10 19:15:47"
##   ..$ truncated    : logi FALSE
##   ..$ replyToSID   : chr(0) 
##   ..$ id           : chr "929065065982119936"
##   ..$ replyToUID   : chr(0) 
##   ..$ statusSource : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>"
##   ..$ screenName   : chr "BuzzNicholson"
##   ..$ retweetCount : num 3
##   ..$ isRetweet    : logi TRUE
##   ..$ retweeted    : logi FALSE
##   ..$ longitude    : chr(0) 
##   ..$ latitude     : chr(0) 
##   ..$ urls         :'data.frame':    1 obs. of  5 variables:
##   .. ..$ url         : chr "https://t.co/qQssORUXwt"
##   .. ..$ expanded_url: chr "https://www.sesync.org/opportunities/short-courses/geospatial-data-analysis-short-course"
##   .. ..$ display_url : chr "sesync.org/opportunities/…"
##   .. ..$ start_index : num 99
##   .. ..$ stop_index  : num 122
##   ..and 53 methods, of which 39 are  possibly relevant:
##   ..  getCreated, getFavoriteCount, getFavorited, getId, getIsRetweet,
##   ..  getLatitude, getLongitude, getReplyToSID, getReplyToSN,
##   ..  getReplyToUID, getRetweetCount, getRetweeted, getRetweeters,
##   ..  getRetweets, getScreenName, getStatusSource, getText, getTruncated,
##   ..  getUrls, initialize, setCreated, setFavoriteCount, setFavorited,
##   ..  setId, setIsRetweet, setLatitude, setLongitude, setReplyToSID,
##   ..  setReplyToSN, setReplyToUID, setRetweetCount, setRetweeted,
##   ..  setScreenName, setStatusSource, setText, setTruncated, setUrls,
##   ..  toDataFrame, toDataFrame#twitterObj
##  $ :Reference class 'status' [package "twitteR"] with 17 fields
##   ..$ text         : chr "RT @ucfagls: Smoothing discrete spatial data with a Markov random field smoother in #mgcv w #rstats\nhttps://t."| __truncated__
##   ..$ favorited    : logi FALSE
##   ..$ favoriteCount: num 0
##   ..$ replyToSN    : chr(0) 
##   ..$ created      : POSIXct[1:1], format: "2017-11-10 19:15:45"
##   ..$ truncated    : logi FALSE
##   ..$ replyToSID   : chr(0) 
##   ..$ id           : chr "929065058088378368"
##   ..$ replyToUID   : chr(0) 
##   ..$ statusSource : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>"
##   ..$ screenName   : chr "srharacha"
##   ..$ retweetCount : num 13
##   ..$ isRetweet    : logi TRUE
##   ..$ retweeted    : logi FALSE
##   ..$ longitude    : chr(0) 
##   ..$ latitude     : chr(0) 
##   ..$ urls         :'data.frame':    1 obs. of  5 variables:
##   .. ..$ url         : chr "https://t.co/Tm5Im3tt2z"
##   .. ..$ expanded_url: chr "http://www.fromthebottomoftheheap.net/2017/10/19/first-steps-with-mrf-smooths/"
##   .. ..$ display_url : chr "fromthebottomoftheheap.net/2017/10/19/fir…"
##   .. ..$ start_index : num 100
##   .. ..$ stop_index  : num 123
##   ..and 53 methods, of which 39 are  possibly relevant:
##   ..  getCreated, getFavoriteCount, getFavorited, getId, getIsRetweet,
##   ..  getLatitude, getLongitude, getReplyToSID, getReplyToSN,
##   ..  getReplyToUID, getRetweetCount, getRetweeted, getRetweeters,
##   ..  getRetweets, getScreenName, getStatusSource, getText, getTruncated,
##   ..  getUrls, initialize, setCreated, setFavoriteCount, setFavorited,
##   ..  setId, setIsRetweet, setLatitude, setLongitude, setReplyToSID,
##   ..  setReplyToSN, setReplyToUID, setRetweetCount, setRetweeted,
##   ..  setScreenName, setStatusSource, setText, setTruncated, setUrls,
##   ..  toDataFrame, toDataFrame#twitterObj

Tidying tweets

# twListToDF() flattens the list of status objects into a data frame,
# one row per tweet
df <- twListToDF(tweets) %>%
  as_tibble()
df
## # A tibble: 5 x 16
##                                                                          text
##                                                                         <chr>
## 1 "#rstats Thank you @jkregenstein for a year's worth of illuminating posts o
## 2 RT @dalejbarr: Sign up for this @PSstatistics course on #rstats for psych w
## 3 RT @RLangTip: Get started with foreach and parallel programming: https://t.
## 4 RT @sesync: Want to learn about geospatial data analysis? Apply for short c
## 5 "RT @ucfagls: Smoothing discrete spatial data with a Markov random field sm
## # ... with 15 more variables: favorited <lgl>, favoriteCount <dbl>,
## #   replyToSN <lgl>, created <dttm>, truncated <lgl>, replyToSID <lgl>,
## #   id <chr>, replyToUID <lgl>, statusSource <chr>, screenName <chr>,
## #   retweetCount <dbl>, isRetweet <lgl>, retweeted <lgl>, longitude <lgl>,
## #   latitude <lgl>

Exercise: Practice using twitteR

Messy API responses

# omdb API function
# Build an OMDB API request URL from its component parameters.
#
# Key    - OMDB API key (apikey=)
# Title  - movie title to search for (t=)
# Year   - release year (y=)
# Plot   - plot length, "short" or "full" (plot=)
# Format - response format, "json" or "xml" (r=)
#
# Returns the full request URL as a length-1 character vector.
# Fix: values are now URL-encoded, so titles containing spaces or
# punctuation ("Mad Max") produce a valid query string; uses base R only
# instead of purrr/stringr, which this script never loads.
omdb <- function(Key, Title, Year, Plot, Format){
  baseurl <- "http://www.omdbapi.com/?"
  params <- c("apikey=", "t=", "y=", "plot=", "r=")
  # percent-encode each value (reserved = TRUE also encodes &, =, spaces)
  values <- vapply(
    c(Key, Title, Year, Plot, Format),
    URLencode,
    character(1),
    reserved = TRUE,
    USE.NAMES = FALSE
  )
  # pair each parameter name with its value, then join with "&"
  args <- paste(paste0(params, values), collapse = "&")
  paste0(baseurl, args)
}

# use curl to execute the query
# build the request URL with the API key stored in .Rprofile
request_sharknado <- omdb(getOption("omdb_key"), "Sharknado", "2013", "short", "json")
# open a connection to the URL, read the raw JSON response, then close it
con <- curl(request_sharknado)
answer_json <- readLines(con)
close(con)

# convert to data frame
# this fails: the Ratings element is itself a data frame, so the parsed
# list is not rectangular and cannot be flattened (see the error below)
answer_json %>% 
  fromJSON() %>% 
  as_tibble()
## Error: Column `Ratings` must be a 1d atomic vector or a list

Whoops

# keep the parsed response as a nested list instead
sharknado <- answer_json %>% 
  fromJSON()

str(sharknado)
## List of 25
##  $ Title     : chr "Sharknado"
##  $ Year      : chr "2013"
##  $ Rated     : chr "TV-14"
##  $ Released  : chr "11 Jul 2013"
##  $ Runtime   : chr "86 min"
##  $ Genre     : chr "Comedy, Horror, Sci-Fi"
##  $ Director  : chr "Anthony C. Ferrante"
##  $ Writer    : chr "Thunder Levin"
##  $ Actors    : chr "Ian Ziering, Tara Reid, John Heard, Cassandra Scerbo"
##  $ Plot      : chr "When a freak hurricane swamps Los Angeles, nature's deadliest killer rules sea, land, and air as thousands of s"| __truncated__
##  $ Language  : chr "English"
##  $ Country   : chr "USA"
##  $ Awards    : chr "1 win & 2 nominations."
##  $ Poster    : chr "https://images-na.ssl-images-amazon.com/images/M/MV5BOTE2OTk4MTQzNV5BMl5BanBnXkFtZTcwODUxOTM3OQ@@._V1_SX300.jpg"
##  $ Ratings   :'data.frame':  2 obs. of  2 variables:
##   ..$ Source: chr [1:2] "Internet Movie Database" "Rotten Tomatoes"
##   ..$ Value : chr [1:2] "3.3/10" "82%"
##  $ Metascore : chr "N/A"
##  $ imdbRating: chr "3.3"
##  $ imdbVotes : chr "38,948"
##  $ imdbID    : chr "tt2724064"
##  $ Type      : chr "movie"
##  $ DVD       : chr "03 Sep 2013"
##  $ BoxOffice : chr "N/A"
##  $ Production: chr "NCM Fathom"
##  $ Website   : chr "http://www.mtivideo.com/TitleView.aspx?TITLE_ID=728"
##  $ Response  : chr "True"
# interactive widget for browsing the nested list
jsonedit(sharknado, mode = "view", elementId = "sharknado")

Inspecting and exploring lists

library(purrr)
library(repurrrsive)  # example lists, including got_chars and gh_repos

# got_chars: 29 characters, each a list of 18 fields
str(got_chars, list.len = 3)
## List of 29
##  $ :List of 18
##   ..$ url        : chr "http://www.anapioficeandfire.com/api/characters/1022"
##   ..$ id         : int 1022
##   ..$ name       : chr "Theon Greyjoy"
##   .. [list output truncated]
##  $ :List of 18
##   ..$ url        : chr "http://www.anapioficeandfire.com/api/characters/1052"
##   ..$ id         : int 1052
##   ..$ name       : chr "Tyrion Lannister"
##   .. [list output truncated]
##  $ :List of 18
##   ..$ url        : chr "http://www.anapioficeandfire.com/api/characters/1074"
##   ..$ id         : int 1074
##   ..$ name       : chr "Victarion Greyjoy"
##   .. [list output truncated]
##   [list output truncated]
jsonedit(got_chars, mode = "view", elementId = "got_chars")

Name and position shortcuts

# a character .f is a shortcut: extract the "name" element of each list
map(got_chars[1:4], "name")
## [[1]]
## [1] "Theon Greyjoy"
## 
## [[2]]
## [1] "Tyrion Lannister"
## 
## [[3]]
## [1] "Victarion Greyjoy"
## 
## [[4]]
## [1] "Will"
  • Equivalent to function(x) x[["name"]]

Name and position shortcuts

# an integer .f extracts by position: element 3 of each character is the name
map(got_chars[5:8], 3)
## [[1]]
## [1] "Areo Hotah"
## 
## [[2]]
## [1] "Chett"
## 
## [[3]]
## [1] "Cressen"
## 
## [[4]]
## [1] "Arianne Martell"
  • Equivalent to function(x) x[[3]]

Name and position shortcuts with pipe

# the same shortcuts work inside a pipe
got_chars %>% 
  map("name")
got_chars %>% 
  map(3)

Type-specific map

# map_chr() returns a character vector instead of a list
map_chr(got_chars[9:12], "name")
## [1] "Daenerys Targaryen" "Davos Seaworth"     "Arya Stark"        
## [4] "Arys Oakheart"
# position shortcut works with typed map_* too
map_chr(got_chars[13:16], 3)
## [1] "Asha Greyjoy"    "Barristan Selmy" "Varamyr"         "Brandon Stark"

Extract multiple values

# Victarion element: [[ extracts one character's full list of 18 fields
got_chars[[3]]
## $url
## [1] "http://www.anapioficeandfire.com/api/characters/1074"
## 
## $id
## [1] 1074
## 
## $name
## [1] "Victarion Greyjoy"
## 
## $gender
## [1] "Male"
## 
## $culture
## [1] "Ironborn"
## 
## $born
## [1] "In 268 AC or before, at Pyke"
## 
## $died
## [1] ""
## 
## $alive
## [1] TRUE
## 
## $titles
## [1] "Lord Captain of the Iron Fleet" "Master of the Iron Victory"    
## 
## $aliases
## [1] "The Iron Captain"
## 
## $father
## [1] ""
## 
## $mother
## [1] ""
## 
## $spouse
## [1] ""
## 
## $allegiances
## [1] "House Greyjoy of Pyke"
## 
## $books
## [1] "A Game of Thrones" "A Clash of Kings"  "A Storm of Swords"
## 
## $povBooks
## [1] "A Feast for Crows"    "A Dance with Dragons"
## 
## $tvSeries
## list()
## 
## $playedBy
## list()
# specific elements for Victarion: [ with a name vector keeps a sub-list
got_chars[[3]][c("name", "culture", "gender", "born")]
## $name
## [1] "Victarion Greyjoy"
## 
## $culture
## [1] "Ironborn"
## 
## $gender
## [1] "Male"
## 
## $born
## [1] "In 268 AC or before, at Pyke"

Adapt to map() framework

map(.x, .f, ...)
  • .f = `[` (the single-bracket subsetting function)
  • ... = character vector identifying the names of the elements to extract

Adapt to map() framework

# .f = `[`: subset each character down to the four named elements
x <- map(got_chars, `[`, c("name", "culture", "gender", "born"))
str(x[16:17])
## List of 2
##  $ :List of 4
##   ..$ name   : chr "Brandon Stark"
##   ..$ culture: chr "Northmen"
##   ..$ gender : chr "Male"
##   ..$ born   : chr "In 290 AC, at Winterfell"
##  $ :List of 4
##   ..$ name   : chr "Brienne of Tarth"
##   ..$ culture: chr ""
##   ..$ gender : chr "Female"
##   ..$ born   : chr "In 280 AC"

magrittr::extract()

library(magrittr)

# extract() is magrittr's readable alias for `[`
x <- map(got_chars, extract, c("name", "culture", "gender", "born"))
str(x[18:19])
## List of 2
##  $ :List of 4
##   ..$ name   : chr "Catelyn Stark"
##   ..$ culture: chr "Rivermen"
##   ..$ gender : chr "Female"
##   ..$ born   : chr "In 264 AC, at Riverrun"
##  $ :List of 4
##   ..$ name   : chr "Cersei Lannister"
##   ..$ culture: chr "Westerman"
##   ..$ gender : chr "Female"
##   ..$ born   : chr "In 266 AC, at Casterly Rock"

Data frame output

# map_df() row-binds the per-character sub-lists into one tibble
map_df(got_chars, extract, c("name", "culture", "gender", "id", "born", "alive"))
## # A tibble: 29 x 6
##                  name  culture gender    id
##                 <chr>    <chr>  <chr> <int>
##  1      Theon Greyjoy Ironborn   Male  1022
##  2   Tyrion Lannister            Male  1052
##  3  Victarion Greyjoy Ironborn   Male  1074
##  4               Will            Male  1109
##  5         Areo Hotah Norvoshi   Male  1166
##  6              Chett            Male  1267
##  7            Cressen            Male  1295
##  8    Arianne Martell  Dornish Female   130
##  9 Daenerys Targaryen Valyrian Female  1303
## 10     Davos Seaworth Westeros   Male  1319
## # ... with 19 more rows, and 2 more variables: born <chr>, alive <lgl>

More robust approach

# Build the tibble column by column with one type-stable map_*
# extractor per column: map_chr/map_int/map_lgl error loudly rather
# than silently coerce if an element has the wrong type.
tibble(
  name = map_chr(got_chars, "name"),
  culture = map_chr(got_chars, "culture"),
  gender = map_chr(got_chars, "gender"),
  id = map_int(got_chars, "id"),
  born = map_chr(got_chars, "born"),
  alive = map_lgl(got_chars, "alive")
)
## # A tibble: 29 x 6
##                  name  culture gender    id
##                 <chr>    <chr>  <chr> <int>
##  1      Theon Greyjoy Ironborn   Male  1022
##  2   Tyrion Lannister            Male  1052
##  3  Victarion Greyjoy Ironborn   Male  1074
##  4               Will            Male  1109
##  5         Areo Hotah Norvoshi   Male  1166
##  6              Chett            Male  1267
##  7            Cressen            Male  1295
##  8    Arianne Martell  Dornish Female   130
##  9 Daenerys Targaryen Valyrian Female  1303
## 10     Davos Seaworth Westeros   Male  1319
## # ... with 19 more rows, and 2 more variables: born <chr>, alive <lgl>

Exercise: simplify gh_users

List inside a data frame

# gh_repos: 6 users, each a list of repos, each repo a list of 68 fields
str(gh_repos, list.len = 2)
## List of 6
##  $ :List of 30
##   ..$ :List of 68
##   .. ..$ id               : int 61160198
##   .. ..$ name             : chr "after"
##   .. .. [list output truncated]
##   ..$ :List of 68
##   .. ..$ id               : int 40500181
##   .. ..$ name             : chr "argufy"
##   .. .. [list output truncated]
##   .. [list output truncated]
##  $ :List of 30
##   ..$ :List of 68
##   .. ..$ id               : int 14756210
##   .. ..$ name             : chr "2013-11_sfu"
##   .. .. [list output truncated]
##   ..$ :List of 68
##   .. ..$ id               : int 14152301
##   .. ..$ name             : chr "2014-01-27-miami"
##   .. .. [list output truncated]
##   .. [list output truncated]
##   [list output truncated]
# interactive widget for browsing the nested list
jsonedit(gh_repos, mode = "view", elementId = "gh_repos")

Vector input to extraction shortcuts

# a vector .f maps a path: first repo, then its 3rd element (full_name)
gh_repos %>%
  map_chr(c(1, 3))
## [1] "gaborcsardi/after"   "jennybc/2013-11_sfu" "jtleek/advdatasci"  
## [4] "juliasilge/2016-14"  "leeper/ampolcourse"  "masalmon/aqi_pdf"

Get it into a data frame

One row per repository, with variables identifying which GitHub user owns it, the repository name, etc.

Create a data frame with usernames and gh_repos

# path c(1, 4, 1): first repo -> element 4 (owner) -> its first field,
# which yields the username (see output)
(unames <- map_chr(gh_repos, c(1, 4, 1)))
## [1] "gaborcsardi" "jennybc"     "jtleek"      "juliasilge"  "leeper"     
## [6] "masalmon"
# name the list by user, then convert it to a two-column tibble with a
# list-column holding each user's repos
(udf <- gh_repos %>%
    set_names(unames) %>% 
    enframe("username", "gh_repos"))
## # A tibble: 6 x 2
##      username    gh_repos
##         <chr>      <list>
## 1 gaborcsardi <list [30]>
## 2     jennybc <list [30]>
## 3      jtleek <list [30]>
## 4  juliasilge <list [26]>
## 5      leeper <list [30]>
## 6    masalmon <list [30]>

How many repos are associated with each user?

# count repos per user: length of each list-column element
udf %>% 
  mutate(n_repos = map_int(gh_repos, length))
## # A tibble: 6 x 3
##      username    gh_repos n_repos
##         <chr>      <list>   <int>
## 1 gaborcsardi <list [30]>      30
## 2     jennybc <list [30]>      30
## 3      jtleek <list [30]>      30
## 4  juliasilge <list [26]>      26
## 5      leeper <list [30]>      30
## 6    masalmon <list [30]>      30

Practice on a single user

# one_user is a list of repos for one user (the first user in udf)
one_user <- udf$gh_repos[[1]]

# one_user[[1]] is a list of info for one repo
one_repo <- one_user[[1]]
str(one_repo, max.level = 1, list.len = 5)
## List of 68
##  $ id               : int 61160198
##  $ name             : chr "after"
##  $ full_name        : chr "gaborcsardi/after"
##  $ owner            :List of 17
##  $ private          : logi FALSE
##   [list output truncated]
# a highly selective list of tibble-worthy info for one repo
one_repo[c("name", "fork", "open_issues")]
## $name
## [1] "after"
## 
## $fork
## [1] FALSE
## 
## $open_issues
## [1] 0
# make a data frame of that info for all a user's repos (one row per repo)
map_df(one_user, `[`, c("name", "fork", "open_issues"))
## # A tibble: 30 x 3
##           name  fork open_issues
##          <chr> <lgl>       <int>
##  1       after FALSE           0
##  2      argufy FALSE           6
##  3         ask FALSE           4
##  4 baseimports FALSE           0
##  5      citest  TRUE           0
##  6  clisymbols FALSE           0
##  7      cmaker  TRUE           0
##  8       cmark  TRUE           0
##  9  conditions  TRUE           0
## 10      crayon FALSE           7
## # ... with 20 more rows
# same result using magrittr::extract instead of bare `[`
map_df(one_user, extract, c("name", "fork", "open_issues"))
## # A tibble: 30 x 3
##           name  fork open_issues
##          <chr> <lgl>       <int>
##  1       after FALSE           0
##  2      argufy FALSE           6
##  3         ask FALSE           4
##  4 baseimports FALSE           0
##  5      citest  TRUE           0
##  6  clisymbols FALSE           0
##  7      cmaker  TRUE           0
##  8       cmark  TRUE           0
##  9  conditions  TRUE           0
## 10      crayon FALSE           7
## # ... with 20 more rows

Scale up to all users

Scale up to all users

# add a list-column: one tibble of repo name/fork/open_issues per user
udf %>% 
  mutate(repo_info = map(
    gh_repos,
    ~ map_df(.x, extract, c("name", "fork", "open_issues"))
  ))
## # A tibble: 6 x 3
##      username    gh_repos         repo_info
##         <chr>      <list>            <list>
## 1 gaborcsardi <list [30]> <tibble [30 x 3]>
## 2     jennybc <list [30]> <tibble [30 x 3]>
## 3      jtleek <list [30]> <tibble [30 x 3]>
## 4  juliasilge <list [26]> <tibble [26 x 3]>
## 5      leeper <list [30]> <tibble [30 x 3]>
## 6    masalmon <list [30]> <tibble [30 x 3]>

Tidy the data frame

# repeat the list-column construction, then drop the raw list-column and
# unnest so each repo becomes its own row alongside its username
(rdf <- udf %>% 
   mutate(
     repo_info = gh_repos %>%
       map(. %>%
             map_df(extract, c("name", "fork", "open_issues")))
   ) %>% 
   select(-gh_repos) %>% 
   tidyr::unnest())
## # A tibble: 176 x 4
##       username        name  fork open_issues
##          <chr>       <chr> <lgl>       <int>
##  1 gaborcsardi       after FALSE           0
##  2 gaborcsardi      argufy FALSE           6
##  3 gaborcsardi         ask FALSE           4
##  4 gaborcsardi baseimports FALSE           0
##  5 gaborcsardi      citest  TRUE           0
##  6 gaborcsardi  clisymbols FALSE           0
##  7 gaborcsardi      cmaker  TRUE           0
##  8 gaborcsardi       cmark  TRUE           0
##  9 gaborcsardi  conditions  TRUE           0
## 10 gaborcsardi      crayon FALSE           7
## # ... with 166 more rows